Data

Which years are missing a metric

dat %>% 
  ggplot(aes(year, lakeid, fill=is.na(daynum))) +
  geom_tile(color="grey") +
  theme_bw() +
  facet_wrap(~metric, nrow=6) 

Remaining data issues:

  • Northern lakes don’t have DOC until until 1986; missing first 4 years when all other variables are available. Potential fixes:
    • Exclude DOC; start analyses in 1986; fill DOC with e.g., median value, or model of other variables?
  • Southern lakes missing chlorophyll data: 1996 - 1999 and 2002 - 2004; 2002-04 is bad/uncalibrated values, used DOY but not magnitude; earlier years fill with median
  • Zoop data issues:
    • CB missing daphnia peak 1995(?): fill with median
    • FI missing several years from 1996 - 2006; fill with median

Predict DOC daynum from other variable daynum’s in each northern lake

no_dups = dat %>% group_by(lakeid, year, metric) %>% summarise(N = n()) %>% filter(N == 1)
## `summarise()` has grouped output by 'lakeid', 'year'. You can override using
## the `.groups` argument.
n_lakes_wide = left_join(no_dups, dat) %>% 
  select(-sampledate) %>% 
  pivot_wider(names_from = metric, values_from = daynum) %>% 
  filter(lakeid %in% c("AL", "BM", "CB", "CR", "SP", "TB", "TR"))
## Joining, by = c("lakeid", "year", "metric")
hold_data = na.omit(data.frame(n_lakes_wide %>%  ungroup() %>% select(-year, -N))) # , -doc_predict
all = lm(doc_epiMax~(iceon+straton+stratoff+energy+
                 stability+anoxia_summer+
                 secchi_openwater_max)*lakeid, data=hold_data)
i_o = lm(doc_epiMax~1, data=hold_data)
hold_step = step(i_o, scope=formula(all))
## Start:  AIC=1761.44
## doc_epiMax ~ 1
## 
##                        Df Sum of Sq    RSS    AIC
## + lakeid                6     74137 395186 1733.7
## + stratoff              1     15696 453627 1755.6
## <none>                              469323 1761.4
## + secchi_openwater_max  1      1334 467989 1762.8
## + iceon                 1       565 468758 1763.2
## + straton               1       531 468792 1763.2
## + anoxia_summer         1       343 468980 1763.3
## + stability             1       143 469180 1763.4
## + energy                1        57 469266 1763.4
## 
## Step:  AIC=1733.72
## doc_epiMax ~ lakeid
## 
##                        Df Sum of Sq    RSS    AIC
## + stratoff              1      6445 388741 1731.9
## <none>                              395186 1733.7
## + energy                1      3404 391782 1733.7
## + anoxia_summer         1      2414 392772 1734.3
## + iceon                 1       595 394591 1735.4
## + straton               1       167 395019 1735.6
## + secchi_openwater_max  1        60 395126 1735.7
## + stability             1        25 395161 1735.7
## - lakeid                6     74137 469323 1761.4
## 
## Step:  AIC=1731.93
## doc_epiMax ~ lakeid + stratoff
## 
##                        Df Sum of Sq    RSS    AIC
## + energy                1      4173 384568 1731.4
## <none>                              388741 1731.9
## + anoxia_summer         1      2654 386087 1732.3
## + iceon                 1       413 388328 1733.7
## - stratoff              1      6445 395186 1733.7
## + straton               1       270 388470 1733.8
## + stability             1       113 388628 1733.9
## + secchi_openwater_max  1        26 388715 1733.9
## + stratoff:lakeid       6     11338 377402 1737.1
## - lakeid                6     64886 453627 1755.6
## 
## Step:  AIC=1731.43
## doc_epiMax ~ lakeid + stratoff + energy
## 
##                        Df Sum of Sq    RSS    AIC
## <none>                              384568 1731.4
## + anoxia_summer         1      2646 381922 1731.8
## - energy                1      4173 388741 1731.9
## + straton               1      1294 383274 1732.7
## + stability             1       382 384186 1733.2
## + iceon                 1       133 384435 1733.3
## + secchi_openwater_max  1        98 384470 1733.4
## - stratoff              1      7214 391782 1733.7
## + stratoff:lakeid       6     10616 373952 1737.0
## + energy:lakeid         6      1353 383215 1742.6
## - lakeid                6     68295 452863 1757.2
doc_model = lm(doc_epiMax~(iceon+straton+stratoff+energy+
                 stability+anoxia_summer+
                 secchi_openwater_max)*lakeid, 
               data=n_lakes_wide, 
               na.action = na.exclude)
summary(doc_model)
## 
## Call:
## lm(formula = doc_epiMax ~ (iceon + straton + stratoff + energy + 
##     stability + anoxia_summer + secchi_openwater_max) * lakeid, 
##     data = n_lakes_wide, na.action = na.exclude)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -146.219  -17.140    3.587   21.164  118.872 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                    9.300e+01  1.720e+02   0.541   0.5893  
## iceon                          2.715e-01  3.632e-01   0.748   0.4557  
## straton                        1.737e-01  3.632e-01   0.478   0.6330  
## stratoff                       4.151e-01  3.326e-01   1.248   0.2137  
## energy                        -2.294e-01  2.246e-01  -1.021   0.3085  
## stability                     -9.092e-02  1.451e-01  -0.627   0.5318  
## anoxia_summer                 -9.641e-02  1.644e-01  -0.586   0.5583  
## secchi_openwater_max          -7.737e-02  8.262e-02  -0.936   0.3503  
## lakeid.L                       1.585e+02  4.570e+02   0.347   0.7292  
## lakeid.Q                       4.298e+02  4.414e+02   0.974   0.3315  
## lakeid.C                       3.833e+02  4.587e+02   0.836   0.4044  
## lakeid^4                      -1.147e+01  4.700e+02  -0.024   0.9806  
## lakeid^5                      -2.263e+02  4.604e+02  -0.491   0.6237  
## lakeid^6                       2.994e+02  4.418e+02   0.678   0.4987  
## iceon:lakeid.L                -1.068e+00  9.637e-01  -1.108   0.2692  
## iceon:lakeid.Q                -5.440e-01  9.478e-01  -0.574   0.5667  
## iceon:lakeid.C                -6.462e-01  9.570e-01  -0.675   0.5004  
## iceon:lakeid^4                 2.688e-01  9.953e-01   0.270   0.7874  
## iceon:lakeid^5                 2.241e-01  9.502e-01   0.236   0.8138  
## iceon:lakeid^6                -2.166e-01  9.514e-01  -0.228   0.8202  
## straton:lakeid.L              -7.965e-01  9.864e-01  -0.807   0.4204  
## straton:lakeid.Q              -2.387e-02  8.998e-01  -0.027   0.9789  
## straton:lakeid.C              -4.402e-02  9.864e-01  -0.045   0.9645  
## straton:lakeid^4               9.142e-01  1.025e+00   0.892   0.3736  
## straton:lakeid^5               1.250e+00  9.865e-01   1.267   0.2069  
## straton:lakeid^6               6.786e-01  8.723e-01   0.778   0.4376  
## stratoff:lakeid.L              4.187e-01  8.989e-01   0.466   0.6419  
## stratoff:lakeid.Q             -4.273e-01  7.939e-01  -0.538   0.5911  
## stratoff:lakeid.C             -2.607e-01  9.036e-01  -0.288   0.7733  
## stratoff:lakeid^4             -4.354e-01  9.746e-01  -0.447   0.6556  
## stratoff:lakeid^5              6.122e-01  9.083e-01   0.674   0.5012  
## stratoff:lakeid^6             -1.638e+00  7.856e-01  -2.085   0.0384 *
## energy:lakeid.L                1.945e-01  6.025e-01   0.323   0.7471  
## energy:lakeid.Q               -2.106e-01  5.697e-01  -0.370   0.7121  
## energy:lakeid.C               -5.028e-02  5.939e-01  -0.085   0.9326  
## energy:lakeid^4               -1.263e-01  6.430e-01  -0.196   0.8446  
## energy:lakeid^5               -4.185e-01  5.853e-01  -0.715   0.4756  
## energy:lakeid^6                3.159e-02  5.681e-01   0.056   0.9557  
## stability:lakeid.L             1.413e-03  3.542e-01   0.004   0.9968  
## stability:lakeid.Q             3.411e-02  3.761e-01   0.091   0.9278  
## stability:lakeid.C            -1.242e-01  3.879e-01  -0.320   0.7491  
## stability:lakeid^4            -3.883e-01  3.487e-01  -1.113   0.2670  
## stability:lakeid^5             3.507e-02  4.190e-01   0.084   0.9334  
## stability:lakeid^6             5.326e-02  4.122e-01   0.129   0.8973  
## anoxia_summer:lakeid.L         1.285e-01  4.654e-01   0.276   0.7828  
## anoxia_summer:lakeid.Q        -1.687e-02  4.508e-01  -0.037   0.9702  
## anoxia_summer:lakeid.C         4.096e-02  4.174e-01   0.098   0.9219  
## anoxia_summer:lakeid^4         1.103e-01  4.878e-01   0.226   0.8213  
## anoxia_summer:lakeid^5        -3.720e-01  3.631e-01  -1.025   0.3069  
## anoxia_summer:lakeid^6         2.768e-01  4.140e-01   0.669   0.5046  
## secchi_openwater_max:lakeid.L  5.459e-01  2.371e-01   2.302   0.0225 *
## secchi_openwater_max:lakeid.Q -4.266e-01  2.379e-01  -1.793   0.0746 .
## secchi_openwater_max:lakeid.C -1.680e-01  2.245e-01  -0.748   0.4552  
## secchi_openwater_max:lakeid^4 -7.175e-02  1.943e-01  -0.369   0.7123  
## secchi_openwater_max:lakeid^5 -1.379e-01  2.111e-01  -0.653   0.5143  
## secchi_openwater_max:lakeid^6  2.000e-01  2.030e-01   0.985   0.3258  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 42.56 on 180 degrees of freedom
##   (37 observations deleted due to missingness)
## Multiple R-squared:  0.3068, Adjusted R-squared:  0.09503 
## F-statistic: 1.449 on 55 and 180 DF,  p-value: 0.03679
n_lakes_wide$doc_predict = predict(doc_model, n_lakes_wide)

cor = n_lakes_wide %>% 
  group_by(lakeid) %>% 
  summarise(r = paste("r =", round(cor(doc_epiMax, doc_predict, use="complete.obs"), 3)))

n_lakes_wide %>% 
  ggplot(aes(x=doc_epiMax, y=doc_predict, color=as.character(lakeid))) +
  geom_point()+
  theme_bw() +
  labs(color="lakeid")+
  geom_abline(slope=1, intercept = 0) + 
  facet_wrap(~lakeid) + 
  geom_text(aes(label=r), data=cor, x=300, y=0) +
  labs(x="obs peak DOC (daynum)",  y="modeled peak DOC (daynum)")
## Warning: Removed 37 rows containing missing values (`geom_point()`).

# not good but vaguely positive relationship; use this instead of median?
missing_doc = n_lakes_wide %>% filter(is.na(doc_epiMax))

missing_doc$doc_fill = round(predict(doc_model, missing_doc))

doc_fill = missing_doc %>% 
  select(lakeid, year, doc_fill) %>% 
  rename(daynum_fill = doc_fill) %>% 
  mutate(metric = "doc_epiMax") %>% 
  filter(year < 2000)

Create filled metric column and filled

all_fill = bind_rows(doc_fill)
dat_filled = full_join(dat, all_fill) %>% 
  mutate(filled_flag = ifelse(is.na(daynum) & !is.na(daynum_fill), TRUE, FALSE)) %>% 
  mutate(daynum_fill = ifelse(is.na(daynum) & !is.na(daynum_fill), daynum_fill, daynum))
## Joining, by = c("lakeid", "metric", "year")
dat_filled$lakeid = factor(dat_filled$lakeid, 
                    levels = c("AL", "BM", "CB", "CR", "SP", "TB", "TR", "FI", "ME", "MO", "WI"), 
                    ordered = T)
vars = c("iceoff", "straton", "secchi_all", "secchi_openwater_max","secchi_openwater_min", "zoopDensity","zoopDensity_CC",
        "doc_epiMax","totpuf_epiMax", "totpuf_epiMin", "totpuf_hypoMax", "totpuf_hypoMin", 
        "anoxia", "anoxia_summer", "stability", "energy", "stratoff", "iceon")
dat_filled$metric = factor(dat_filled$metric,
                    levels = rev(vars),
                    ordered=T)

dat_filled %>% 
  ggplot(aes(year, metric, fill=filled_flag)) +
  geom_tile(color="grey") +
  theme_bw() +
  facet_wrap(~lakeid, nrow=6)

## Additional trimming/filling

Trim to “good” years, fill FI ice data with Monona, then fill additional missing with median values for each lake/metric

df_yearsWant = dat_filled %>% 
  filter((lakeid %in% c("FI", "ME", "MO", "WI") & year %in% 1996:2018) |
           (!(lakeid %in% c("FI", "ME", "MO", "WI")) & year %in% 1982:2018))

fi_icefill = df_yearsWant %>% 
  filter(lakeid == "MO" & metric %in% c("iceoff", "iceon")) %>% 
  mutate(lakeid = "FI") %>% 
  select(-daynum, -filled_flag, -sampledate) %>% 
  rename(icefill = daynum_fill)

df_yearsWant = df_yearsWant %>% 
  full_join(fi_icefill) %>% 
  mutate(daynum_fill = ifelse(is.na(daynum_fill) & !is.na(icefill), 
                              icefill, daynum),
         filled_flag = ifelse(is.na(daynum) & !is.na(daynum_fill) & !is.na(icefill), 
                              TRUE, filled_flag))
## Joining, by = c("lakeid", "metric", "year")
df_yearsWant %>% 
  ggplot(aes(year, metric, fill=filled_flag)) +
  geom_tile(color="grey") +
  theme_bw() +
  facet_wrap(~lakeid, nrow=6)

Final fill w/ medians

all_lys = df_yearsWant %>% 
  select(lakeid, year) %>% 
  distinct()

metric = unique(df_yearsWant$metric)

all_lys_want = expand_grid(all_lys, metric)

dat_final = left_join(all_lys_want, df_yearsWant)
## Joining, by = c("lakeid", "year", "metric")
medians = dat_final %>% 
  group_by(lakeid, metric) %>% 
  summarise(median_daynum = median(daynum_fill, na.rm=T))
## `summarise()` has grouped output by 'lakeid'. You can override using the
## `.groups` argument.
dat_final = dat_final %>% 
  left_join(medians) %>% 
  mutate(daynum_fill = ifelse(is.na(daynum_fill), median_daynum, daynum_fill)) %>% 
  mutate(filled_flag = ifelse(is.na(daynum) & !is.na(daynum_fill), TRUE, filled_flag)) %>% 
  select(-icefill, -median_daynum)
## Joining, by = c("lakeid", "metric")
dat_final %>%  
  ggplot(aes(year, metric, fill=filled_flag)) +
  geom_tile(color="grey") +
  theme_bw() +
  facet_wrap(~lakeid, nrow=6)

write_csv(dat_final, "Data/analysis_ready/final_combined_dates_filled_v2.csv")